#!/usr/bin/env python
# 
# Look up references on Google Scholar and add a URL that points to a copy of the paper.
#
# Nice to complete the bibliography, and if you want to convert the bibliography to HTML
# (with bib2html).
#


# Copyright (c) 2007, Peter Corke
#
# All rights reserved.
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
#     * Redistributions of source code must retain the above copyright
#       notice, this list of conditions and the following disclaimer.
#     * Redistributions in binary form must reproduce the above copyright
#       notice, this list of conditions and the following disclaimer in the
#       documentation and/or other materials provided with the distribution.
#     * The name of the copyright holder may not be used to endorse or 
#	promote products derived from this software without specific prior 
#	written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ``AS IS''
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS AND CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
# THE POSSIBILITY OF SUCH DAMAGE.

import Bibliography;
import BibEntry;
import BibTeX;
import string;
import sys;
import re;
import urllib
import urlparse
import htmllib
import formatter
import time;
import optparse;

# hosts to favour when a reference resolves to several candidate URLs
prefList = [
    'ieeexplore.ieee.org',
    'portal.acm.org',
    'doi.ieeecomputersociety.org',
]

class Parser(htmllib.HTMLParser):
    # build a list of tuples (anchor text, URL)

    def __init__(self, verbose=0):
        self.anchors = [];
        f = formatter.NullFormatter()
        htmllib.HTMLParser.__init__(self, f, verbose)

    def anchor_bgn(self, href, name, type):
        self.save_bgn()
        self.href = href
	self.name = name
	self.type = type

    def anchor_end(self):
        text = string.strip(self.save_end())
        if self.href and text:
            #self.anchors[text] = self.anchors.get(text, []) + [self.anchor]
            #self.anchors[text] = self.anchor
	    self.anchors.append( (text, self.href) );

# trick Google into thinking I'm using Safari
browserName = "Mozilla/5.0 (Macintosh; U; PPC Mac OS X; en) AppleWebKit/312.1 (KHTML, like Gecko) Safari/312";

class AppURLopener(urllib.FancyURLopener):
    version = browserName;

urllib._urlopener = AppURLopener()

def _levenshtein(a, b):
    """Return the Levenshtein (edit) distance between the strings a and b."""
    # dynamic programme over the cost table, one row per character of a;
    # only the previous row is needed to compute the current one
    previous = list(range(len(b) + 1))
    for i in range(1, len(a) + 1):
        current = [i]
        for j in range(1, len(b) + 1):
            if a[i - 1] == b[j - 1]:
                substitute = previous[j - 1]
            else:
                substitute = previous[j - 1] + 1
            current.append(min(previous[j] + 1, current[j - 1] + 1, substitute))
        previous = current
    return previous[-1]


def _scholar_query_terms(be):
    """Return the list of search words for the entry be.

    The words are the title words of at least four characters, with the
    characters ``#{}:;,&$-`` acting as separators, followed by the first
    element of every item of be.getAuthorsSurnameList().  Full stops,
    apostrophes and double quotes are removed from each word, and words
    that are empty afterwards are dropped.
    """
    long_words = [w for w in be.getTitle().split() if len(w) >= 4]
    cleaned_title = re.sub(r"""[#{}:;,&$-]""", " ", ' '.join(long_words))

    terms = cleaned_title.split()
    terms.extend([x[0] for x in be.getAuthorsSurnameList()])

    terms = [re.sub(r"""\.|['"]""", "", t) for t in terms]
    return [t for t in terms if t]


def _pick_url(candidates):
    """Return the best URL of candidates, or None if the list is empty.

    Order of preference: a host listed in prefList, then a URL containing
    "pdf", then simply the first candidate.
    """
    for url in candidates:
        if urlparse.urlsplit(url)[1] in prefList:
            return url

    for url in candidates:
        if "pdf" in url:
            return url

    if candidates:
        return candidates[0]
    return None


## lookup the BibEntry on Google scholar
def scholar_lookup(be, max_distance=5):
    """Search Google Scholar for be and return a URL for the paper.

    be           -- bibliography entry; must provide getTitle() and
                    getAuthorsSurnameList()
    max_distance -- an anchor text counts as the paper's title when its
                    edit distance to the title is below this value

    Returns the chosen URL as a string, or None when no suitable link is
    found.  Errors raised by urllib.urlopen() propagate to the caller.
    The publication year is not used in the query.
    """
    terms = _scholar_query_terms(be)
    if not terms:
        # nothing to search for
        return None

    # percent-encode every term so that characters such as '&', '?' or
    # non-ASCII letters cannot corrupt the query string
    quoted = []
    for term in terms:
        if not isinstance(term, str):
            # unicode object: the URL announces ie=UTF-8
            term = term.encode('utf-8')
        quoted.append(urllib.quote_plus(term))
    query = "http://www.scholar.google.com/scholar?q=%s&ie=UTF-8&oe=UTF-8&hl=en&btnG=Search" % ( '+'.join(quoted) )

    # send the query to Scholar; always release the connection
    response = urllib.urlopen(query)
    try:
        html = response.read()
    finally:
        response.close()

    # parse the result
    parser = Parser()
    parser.feed(html)
    parser.close()

    title = be.getTitle().lower()
    matched = []     # links whose text is (nearly) the title of the paper
    pdf_only = []    # other links that point at a PDF document
    for text, url in parser.anchors:
        # the target may be embedded in a redirect link: keep the part
        # starting at "http", and ignore links that have no such part
        start = url.find("http")
        if start < 0:
            continue
        target = url[start:]

        text = text.lower()
        # the edit distance is never smaller than the difference of the
        # lengths, so the costly computation is skipped for hopeless cases
        if (abs(len(text) - len(title)) < max_distance
                and _levenshtein(text, title) < max_distance):
            matched.append(target)
        elif "pdf" in target:
            # a URL of the form http:....pdf
            pdf_only.append(target)

    # title matches rank ahead of plain PDF links; drop duplicates
    candidates = []
    for url in matched + pdf_only:
        if url not in candidates:
            candidates.append(url)

    return _pick_url(candidates)


## main

## parse switches
usage = '''usage: %prog [options] [bibfiles]

:: Lookup each reference on Google Scholar and add the URL to the bibliography.'''
p = optparse.OptionParser(usage)
p.add_option('-v', '--verbose', dest='verbose', action='store_true',
             help='print some extra information')
p.set_defaults(verbose=False)
(opts, args) = p.parse_args()
verbose = opts.verbose

# no file named and stdin is a terminal: there is nothing to read
if len(args) == 0 and sys.stdin.isatty():
    p.print_help()
    sys.exit(0)

## read the input files
bib = BibTeX.BibTeX()
if args:
    for f in args:
        bib.parseFile(f)
else:
    # no file argument given; presumably parseFile() then reads stdin
    bib.parseFile()

## lookup each reference on Scholar
count = 0          # number of entries that received a URL
sourceDict = {}    # host name -> number of URLs found on that host
total = len(bib)

if verbose:
    sys.stderr.write("Resolving %d references via Google scholar\n" % total)

for be in bib:
    # only articles and conference papers are looked up
    if be.getRefType() not in ['article', 'inproceedings']:
        continue

    # if we already have a URL then skip
    if be.getURL():
        continue

    # do the lookup; a network failure must not discard the URLs that
    # were already resolved, so report it and carry on
    try:
        url = scholar_lookup(be)
    except IOError:
        sys.stderr.write("Scholar lookup failed, entry skipped: %s\n"
                         % (sys.exc_info()[1],))
        continue
    if not url:
        continue

    if verbose:
        sys.stderr.write(str(be) + "\n")
        sys.stderr.write("  -->  %s\n" % (url,))
        sys.stderr.write("\n")
    be.setField('Url', url)
    count = count + 1

    # count the documents found per source host
    org = urlparse.urlsplit(url)[1]
    sourceDict[org] = sourceDict.get(org, 0) + 1

if verbose:
    # print some stats; an empty bibliography must not divide by zero
    if total:
        percent = count * 100. / total
    else:
        percent = 0.0
    sys.stderr.write("Resolved %d references to URLs (%.1f%%)\n" % (count, percent))

    # print the unique source list, sorted in decreasing order of frequency
    sys.stderr.write("Document sources\n")
    ranked = sorted(sourceDict.items(), key=lambda item: item[1], reverse=True)
    for org, n in ranked:
        sys.stderr.write("    %-30s %d\n" % (org, n))

# output the bibliography with the URLs set
bib.writeStrings()
bib.write()
